Setting the options for knitr…
library(knitr)
knitr::opts_chunk$set(results = 'asis',
echo = TRUE,
comment = NA,
prompt = FALSE,
cache = FALSE,
warning = FALSE,
message = FALSE,
fig.align = "center",
fig.width = 8.125,
out.width = "100%",
fig.path = "Figures/figures_umap/",
dev = c('png', 'tiff'),
dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
cache = TRUE,
cache.path = "D:/cache/cache_umap/",
autodep = TRUE)
options(width = 1000, scipen = 999, knitr.kable.NA = '')Loading libraries…
library(tidyverse)
library(gridExtra)
library(kableExtra)
library(uwot) # To compute UMAP
#require(devtools)
#install_version("clues", version = "0.6.2.2", repos = "http://cran.us.r-project.org")
library(clues) # For silhouettes and silhouette scores
library(parallel)Cleaning the memory…
rm(list = ls(all.names = TRUE))Loading data tables and definitions of features sets prepared during the curation phase…
load(file = "Study_objects.RData")We define the function get_umap_and_silhouette() which
takes as input a data frame (or tibble) which for each predicted
observation contains i) either the values of variables describing the
observation or the probabilities of each possible output (individual or
call type) as returned by a classifier for the observation, ii) the
individual, iii) the call type and iv) the call id… The one_hot
parameter of the function specifies whether we ‘one-hot encode’ the
probabilities returned by a classifier (meaning that the highest
probability gets the value 1 and all the others the value 0). This only
makes sense when one provides such probabilities, and not raw values for
variables describing the calls.
get_umap_and_silhouette <- function(df_umap, target_DV, target_algo, target_set, one_hot = F, n_neighbors = 15, metric = "euclidean", min_dist = 0.01, size = 0.5, alpha = 0.5) {
set.seed(123) # To ensure reproducibility
df_umap <- df_umap %>%
filter(algo == target_algo, set == target_set) %>%
dplyr::select(-algo, -set)
# We assume that we only consider individual vocal signature and call type
if (target_DV == "individual")
other_V <- "type"
if (target_DV == "type")
other_V <- "individual"
# We create a matrix which contains the numerical values we need for the umap
m <- umap_coords <- df_umap %>%
dplyr::select(-id, -individual, -type) %>%
as.matrix()
# If we 'one-hot' encode the values, we transform the matrix row-wise
if (one_hot)
m <- apply(m, 1, function(v) { as.integer(v == max(v)) }) %>% t()
# We compute a UMAP and turn the results (X and Y coordinates) into a tibble with additional information about individual and call type
umap_coords <- m %>%
umap(n_neighbors, n_components = 2, metric = metric, min_dist = min_dist, y = df_umap %>% pull(target_DV), n_threads = 20 - 2) %>%
data.frame() %>%
as_tibble() %>%
mutate(individual = df_umap %>% pull(individual),
type = df_umap %>% pull(type))
# umap_coords <- m %>%
# umap(n_neighbors, n_components = 2, metric = metric, min_dist = min_dist, n_threads = detectCores() - 2) %>%
# data.frame() %>%
# as_tibble() %>%
# mutate(individual = df_umap %>% pull(individual),
# type = df_umap %>% pull(type))
# We create a ggplot figure to display the UMAP
p_umap <- umap_coords %>% ggplot(aes(x = X1, y = X2, col = .data[[target_DV]])) +
geom_point(size = size, alpha = alpha) +
scale_color_brewer(palette = "Paired") +
guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1.0))) +
ggtitle(paste0("UMAP - ", target_algo, "\n", target_set))
# We compute the silhouette for the output of the UMAP
silhouette <- get_Silhouette(as.matrix(umap_coords[, 1:2]), umap_coords[[target_DV]], disMethod = "Euclidean")
silhouette.df <- data.frame(sil = silhouette$s, observed = umap_coords[[target_DV]])
silhouette.df <- silhouette.df[order(silhouette.df$observed, -silhouette.df$sil), ]
silhouette.df$name <- factor(rownames(silhouette.df), levels = rownames(silhouette.df))
# We compute mean silhouette scores per group, i.e., per level of the target DV
means.per.group <- silhouette.df %>%
mutate(name = as.integer(name)) %>%
group_by(observed) %>%
summarise(mean.group = round(mean(sil), 2), mean.name = mean(name), min = name[which.min(name)], max = name[which.max(name)])
# We create a ggplot figure to display the silhouette
sil_plot <- silhouette.df %>%
ggplot(., aes(x = name, y = sil, color = observed, fill = observed)) +
geom_col() +
labs(y = "Silhouette value",
x = "",
fill="Cluster",
color="Cluster",
title = paste0("Silhouette plot - ", target_algo, "\n", target_set),
subtitle = paste0("Average silhouette value: ", round(mean(silhouette.df$sil), 2)) ) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
ggplot2::ylim(c(-1, 1)) +
#geom_hline(yintercept = mean(silhouette.df$sil), linetype = "dashed", color = "red", size=0.75) +
geom_text(data=means.per.group, aes(x=mean.name, y=mean.group, label= mean.group), color="black", hjust = -0.2) +
geom_segment(data = means.per.group, aes(x = min - 0.5, xend = max + 0.5, y = mean.group, yend = mean.group), size=0.75, linetype = "dashed", color="black") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
coord_flip() +
guides(fill="none", color="none")
# We return the whole silhouette as well as the two ggplot figures
return (list(umap = p_umap, silhouette = silhouette, silhouette_plot = sil_plot))
}We first define a function build_probability_data() to build the data tables we need to create UMAP and silhouettes. Given a filename, this function loads an .RData file which contains all the outputs generated previously with various classifiers and feature sets. It extracts the prediction probabilities of the classifiers and then builds and returns a data table with the required information…
build_probability_data <- function(filename) {
load(filename)
probabilities <- all_outputs %>% purrr::map_dfr("prediction_probabilities")
sets <- all_outputs %>% purrr::map("sets")
test_sets <- sets %>% purrr::map("testing_ids")
nb_per_repetition <- probabilities %>% filter(rep == 1) %>% nrow()
reduced_df <- df %>% select(id, individual, type)
all_ids <- c()
for (i in 1:length(test_sets)) {
ids <- test_sets[[i]] %>% as.character()
nb_rep <- nb_per_repetition / length(ids)
all_ids <- c(all_ids, rep(ids, nb_rep))
}
probabilities <- probabilities %>%
mutate(id = all_ids) %>%
select(-true_value, -DV, -rep) %>%
mutate(id = as.factor(id), algo = as.factor(algo), set = as.factor(set)) %>%
inner_join(reduced_df, by = c("id" = "id"))
return (probabilities)
}We then call the previous function twice to prepare data tables for individual signatures and call types, as they were classified with SVM, NN, xgboost and ensemble learners. We also load a file which contains the appropriate data for the pDFA with both individual signatures and call types. first load the results of the assessments of the different models, and create different data tables…
probabilities_individual <- build_probability_data("Result files/results - individual - default.RData")
probabilities_type <- build_probability_data("Result files/results - type - default.RData")
pdfa_data <- readRDS("Result files/pdfa-permuted-probabilities.rds")In the following sub-sections, we repeat the same approach for different configurations: we consider first individual signatures then call types, and, for each of these target variables, get UMAP and silhouettes for i) the raw descriptions of the calls with the features from the three sets ‘Bioacoustic’, ‘DCT’ and ‘MFCC’, ii) the predictions of the pDFA with the ‘Bioacoustic’ feature set, iii) the predictions of the best stacked learner. Once all figures have been created, we assemble them in a single figure used for the publication.
Computing the UMAP and silhouette…
target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)
df_umap <- df %>%
dplyr::select(one_of(target_columns)) %>%
mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")
initial_data_ind <- df_umap %>% get_umap_and_silhouette("individual", "Raw features", "Bioacoustic, DCT, MFCC", size = 1)Displaying the UMAP…
initial_data_ind$umapDisplaying the silhouette…
initial_data_ind$silhouette_plotComputing the UMAP and silhouette…
df_umap <-
pdfa_data$individual$Bioacoustic %>%
as_tibble() %>%
rename(individual = "observed", type = "control", id = "rowname") %>%
mutate(algo = "pDFA", set = "Bioacoustic")
pdFA_ind <- df_umap %>% get_umap_and_silhouette("individual", "pDFA", "Bioacoustic", one_hot = T)Displaying the UMAP…
pdFA_ind$umapDisplaying the silhouette…
pdFA_ind$silhouette_plotComputing the UMAP and silhouette…
best_ensemble_ind <- probabilities_individual %>% get_umap_and_silhouette("individual", "ensemble", "3 classifiers x 3 sets", one_hot = T)Displaying the UMAP…
best_ensemble_ind$umapDisplaying the silhouette…
best_ensemble_ind$silhouette_plotgrid.arrange(
initial_data_ind$umap, pdFA_ind$umap, best_ensemble_ind$umap,
initial_data_ind$silhouette_plot, pdFA_ind$silhouette_plot, best_ensemble_ind$silhouette_plot,
nrow = 2, ncol = 3,
heights = c(0.5, 1)
)Computing the UMAP and silhouette…
target_columns <- c("id", "individual", "type", Bioacoustic_set, DCT_set, MFCC_set)
df_umap <- df %>%
select(one_of(target_columns)) %>%
mutate(algo = "Raw features", set = "Bioacoustic, DCT, MFCC")
initial_data_type <- df_umap %>% get_umap_and_silhouette("type", "Raw features", "Bioacoustic, DCT, MFCC", size = 1)Displaying the UMAP…
initial_data_type$umapDisplaying the silhouette…
initial_data_type$silhouette_plotComputing the UMAP and silhouette…
df_umap <- pdfa_data$type$Bioacoustic %>%
as_tibble() %>%
rename(individual = "control", type = "observed", id = "rowname") %>%
mutate(algo = "pDFA", set = "Bioacoustic")
pdFA_type <- df_umap %>% get_umap_and_silhouette("type", "pDFA", "Bioacoustic", one_hot = T)Displaying the UMAP…
pdFA_type$umapDisplaying the silhouette…
pdFA_type$silhouette_plotComputing the UMAP and silhouette…
best_ensemble_type <- probabilities_type %>% get_umap_and_silhouette("type", "ensemble", "3 classifiers x 3 sets", one_hot = T)Displaying the UMAP…
best_ensemble_type$umapDisplaying the silhouette…
best_ensemble_type$silhouette_plotgrid.arrange(
initial_data_type$umap, pdFA_type$umap, best_ensemble_type$umap,
initial_data_type$silhouette_plot, pdFA_type$silhouette_plot, best_ensemble_type$silhouette_plot,
nrow = 2, ncol = 3,
heights = c(0.5, 1)
)cat(utils::sessionInfo()$R.version$version.string, "<br />", sep = "")R version 4.1.3 (2022-03-10)
cat("Platform: ", utils::sessionInfo()$platform, "<br />", sep = "")Platform: x86_64-w64-mingw32/x64 (64-bit)
cat("OS version: ", utils::sessionInfo()$running, "<br />", sep = "")OS version: Windows 10 x64 (build 22000)
packages.base <- utils::sessionInfo()$basePkgs
packages.others <- names(utils::sessionInfo()$otherPkgs)
packages <- c(packages.base, packages.others)
for(i in 1:length(packages)) {
reference <- cat("`",
packages[i],
"`: ",
"*",
packageDescription(packages[i])$Title,
"* version ",
packageDescription(packages[i])$Version,
"<br />",
sep = "")
reference
}parallel: Support for Parallel computation in R
version 4.1.3stats: The R Stats Package
version 4.1.3graphics: The R Graphics
Package version 4.1.3grDevices: The R
Graphics Devices and Support for Colours and Fonts version
4.1.3utils: The R Utils Package version
4.1.3datasets: The R Datasets Package
version 4.1.3methods: Formal Methods and
Classes version 4.1.3base: The R Base
Package version 4.1.3clues: Clustering
Method Based on Local version 0.6.2.2uwot:
The Uniform Manifold Approximation and Projection (UMAP) Method for
Dimensionality Reduction version 0.1.11Matrix:
Sparse and Dense Matrix Classes and Methods version
1.4-1gridExtra: Miscellaneous Functions for
“Grid” Graphics version 2.3forcats: Tools
for Working with Categorical Variables (Factors) version
0.5.1stringr: Simple, Consistent Wrappers for
Common String Operations version 1.4.0purrr:
Functional Programming Tools version
0.3.4readr: Read Rectangular Text Data
version 2.1.2tidyr: Tidy Messy Data version
1.2.0tibble: Simple Data Frames version
3.1.6ggplot2: Create Elegant Data Visualisations
Using the Grammar of Graphics version
3.3.5tidyverse: Easily Install and Load the
‘Tidyverse’ version 1.3.1dplyr: A Grammar of
Data Manipulation version 1.0.8knitr: A
General-Purpose Package for Dynamic Report Generation in R version
1.37
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎
Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author↩︎